package mosdi.subcommands;

import java.util.List;

import mosdi.fa.FiniteMemoryTextModel;
import mosdi.matching.HorspoolMatcher;
import mosdi.matching.Matcher;
import mosdi.util.Alphabet;
import mosdi.util.CodonAlphabet;
import mosdi.util.Log;
import mosdi.util.NamedSequence;
import mosdi.util.SequenceUtils;

public class CGEmpricStatSubcommand extends Subcommand {
	
	@Override
	public String usage() {
		return
		super.usage()+" [options] <background> <length> <matches>\n" +
		"\n" +
		"  <background> file the background text model is estimated from.\n" +
		"\n" +
		"  Computes an empirical p-value of seeing <matches> number of occurrences\n" +
		"  in a random text of given <length> under a null model estimated from <background>.\n" +
		"\n" +
		"Options:\n" +
		"  -O <order>: Order of background model to be estimated (default: 1)\n" +
		"  -p <pseudocounts>: Pseudocounts for estimation of text model (default: 0.1)\n" +
		"  -c: use conditional p-value, i.e. do not compute probability of observing n or more CGs,\n" +
		"      but the probility of n or more CGs given that the segment starts and ends with CG.\n" +
		"      (Right now implemented as rejection sampling: quite slow/inefficient!)\n" +
		"  -C: Enable \"codon-awareness\". In this case, a codon model estimated\n" +
		"      from the input sequences is used to sample sequences. Also, only CGs in\n" +
		"      frames 1 and 2 are counted\n" +
		"  -i <iterations>: default 10,000";
	}

	@Override
	public String description() {
		return "Computes empirical p-values by sampling.";
	}

	@Override
	public String name() {
		return "empiric-pvalue";
	}
	
	@Override
	public int run(String[] args) {
		parseOptions(args, 3, "O:ci:C");

		// Option dependencies
		// -- none --

		// Mandatory arguments
		String backgroundFilename = getStringArgument(0);
		int length = getIntArgument(1);
		int matches = getIntArgument(2);

		// Options
		int modelOrder = getNonNegativeIntOption("O", 1);
		boolean conditionalPValues = getBooleanOption("c", false);
		int iterations = getPositiveIntOption("i", 10000);
		boolean codonAware = getBooleanOption("C", false);
		double pseudoCounts = this.getRangedDoubleOption("p", 0.0, Double.POSITIVE_INFINITY, 0.1);

		if (codonAware && (length%3!=0)) {
			Log.errorln("Error: If option -C is used, given length must be divisable by three.");
			return 1;
		}
		
		Log.startTimer();
		Alphabet alphabet = Alphabet.getDnaAlphabet();
		FiniteMemoryTextModel textModel;
		try {
			List<NamedSequence> namedSequences = SequenceUtils.readFastaFile(backgroundFilename, alphabet, true);
			if (codonAware) {
				textModel = SequenceUtils.buildCodonModelFromNamedSequences(namedSequences, modelOrder, pseudoCounts);
			} else {
				textModel = SequenceUtils.buildTextModelFromNamedSequences(namedSequences, alphabet.size(), modelOrder, pseudoCounts);
			}
		} catch (Exception e) {
			Log.errorln(e.getMessage());
			return 1;
		}
		Log.restartTimer("Estimating text model from input sequences");
		Matcher matcher = new HorspoolMatcher(alphabet.size(), alphabet.buildIndexArray("CG"));
		int n = 0;
		int m = 0;
		for (int i=0; i<iterations; ++i) {
			if (codonAware) {
				int[] s = CodonAlphabet.codonToDnaText(textModel.generateRandomText(length/3+1));
				s[0] = -1;
				s[1] = -1;
				int[] matchPositions = matcher.findAllMatchPositions(s);
				int[] validMatchPositions = new int[matchPositions.length];
				int k = 0;
				for (int p : matchPositions) {
					if (p%3!=0) validMatchPositions[k++] = p;
				}
				if (conditionalPValues) {
					if (k<1) continue;
					if (validMatchPositions[0]>4) continue;
					if (validMatchPositions[k-1]<s.length-4) continue;
				}
				if (k>=matches) m+=1;
			} else {
				int[] s = textModel.generateRandomText(length);
				if (conditionalPValues) {
					if ((s[0]!=alphabet.getIndex('C')) || (s[1]!=alphabet.getIndex('G')) || (s[length-2]!=alphabet.getIndex('C')) || (s[length-1]!=alphabet.getIndex('G'))) {  
						continue;
					}
				}
				if (matcher.findMatches(s)>=matches) m+=1;
			}
			n += 1;
		}
		Log.stopTimer("Sampling sequences");
		Log.printf(Log.Level.STANDARD, "p-value: %d / %d = %e\n",m,n, ((double)m)/n,args);
		
		return 0;
	}

}
